library(tidyverse)
library(rmarkdown) # You need this library to run this template.
library(epuRateFlo)
library(xlsx)
library(data.table)
library(ggplot2)
library(plotly)
library(lubridate) #date
library(DT)
library(ggmap)
library(broom)
library(rgdal)
library(httr)
library(nycmaps)
library(maps)
library(knitr)
library(kableExtra)
options(scipen = 999)
Data includes pickups information (latitude, longitude, time, etc.) from Uber and 10 other for-hire vehicle (FHV) companies in the New York City area. This analysis will focus on the July-September 2014 period. Note that more data is available and would require a more thorough analysis.
How to use this html file: a “code” button next to the title of this file (top right) gives you the option to show or hide all the code including in this file; alternatively, you can show code for each section separately with the specific “code” button.
Table of contents is available on the left.
# importing the first sheet of the data
aggregate = read.xlsx("uber-tlc-foil-response-master/Aggregate FHV Data.xlsx",1,check.names=FALSE)
# adding day of the week
#aggregate$Day = weekdays(as.Date(aggregate$Date))
#aggregate$Day=factor(aggregate$Day, levels = c("Monday", "Tuesday", "Wednesday", "Thursday","Friday", "Saturday", "Sunday"))
# We combine 9 companies by suming up the number of rides from each company
#aggregate$other9 = apply(aggregate[,2:10],1,sum)
In this section, we are looking into the aggregate data from different FHV:
American, Carmel, Dial 7, Diplo, Firstclass, Highclass, Prestige, Skyline, Lyft, Uber, Yellow Taxis, Green Taxis.
We first combine the data from 9 FHV companies (American, Carmel, Dial 7, Diplo, Firstclass, Highclass, Prestige, Skyline, Lyft) as these companies have low number of rides per day.
# We combine 9 companies by suming up the number of rides from each company
aggregate$other9 = apply(aggregate[,2:10],1,sum)
## plot uber data over time
# shows 2 spikes and weekdays/weekends
ggplot(data = aggregate, aes(x = Date, y = Uber))+
geom_line(color=color.mixo(1))
# adding day of the week
aggregate$Day = weekdays(as.Date(aggregate$Date))
aggregate$Day=factor(aggregate$Day, levels = c("Monday", "Tuesday", "Wednesday", "Thursday","Friday", "Saturday", "Sunday"))
We plot the number of rides over the July-September 2014 period per day of the week. The following graphs are interactive and hoovering gives you the number of rides for each data point.
# Uber: Sundays+Mondays vs Fridays+Saturdays, over September
ind_month=which(month(aggregate$Date) == 9 & day(aggregate$Date) > 5)
FS = ind_month[which(aggregate$Day[ind_month] %in% c("Friday", "Saturday"))]
SM = ind_month[which(aggregate$Day[ind_month] %in% c("Sunday", "Monday"))]
FS_mean=mean(aggregate$Uber[FS])
SM_mean=mean(aggregate$Uber[SM])
#signif((1-SM_mean/FS_mean)*100,2)
# Gren taxis: Saturdays vs Mondays, over September
ind_month=which(month(aggregate$Date) == 9 & day(aggregate$Date) > 5)
S = ind_month[which(aggregate$Day[ind_month] %in% c("Saturday"))]
M = ind_month[which(aggregate$Day[ind_month] %in% c("Monday"))]
S_mean=mean(aggregate$`Green Taxis`[S])
M_mean=mean(aggregate$`Green Taxis`[M])
#signif((1-M_mean/S_mean)*100,2)
The main observations are the following:
myplot = ggplot(data = aggregate, aes(x = Date, y = Uber, group=Day, colour=Day))+
geom_line() + ylab("Trips per day") + ggtitle("Uber")
ggplotly(myplot)
myplot = ggplot(data = aggregate, aes(x = Date, y = `Yellow Taxis`, group=Day, colour=Day))+
geom_line() + ylab("Trips per day") + ggtitle("Yellow Taxis")
ggplotly(myplot)
myplot = ggplot(data = aggregate, aes(x = Date, y = `Green Taxis`, group=Day, colour=Day))+
geom_line() + ylab("Trips per day") + ggtitle("Green Taxis")
ggplotly(myplot)
myplot = ggplot(data = aggregate, aes(x = Date, y = other9, group=Day, colour=Day))+
geom_line() + ylab("Trips per day") + ggtitle("Other 9")
ggplotly(myplot)
In this section, we are looking at the number of rides per hour, averaged over the 3 months period (July-September 2014). There is no data available for the Yellow and Green Taxis; we display data for Uber, Lyft and a combination of the remaining 8 FHV companies.
trips = read.xlsx("uber-tlc-foil-response-master/Aggregate FHV Data.xlsx",2, startRow=2, endRow=170)
# table(trips$Weekday) # check right number of entry per day
df= trips %>% group_by(Hour) %>% summarize_all(list(mean))
# wide to long data
df2=melt(df, id.vars=c("Weekday","Hour"))
df2$variable = factor(df2$variable, levels=c("Uber","Lyft","other.8.bases"), labels=c("Uber","Lyft","other8"))
myplot= ggplot(df2, aes(x = Hour, y=value, group=variable,color=variable)) + geom_line() +
scale_color_discrete(name="")+ ylab("")+
ggtitle ("Average number of trips per hour")
ggplotly(myplot)
We observe that pick hours for Uber is around 17H (5pm), while morning rush hour seems to be around 8am.
The other 8 companies on aggregate also have a pick hour at 8am, while they do not appear to have an afternoon rush.
In contrast, Lyft seems to have more customers at night, between 8pm and 5am.
In this section, we are having a quick look at the latitude and longitude of Uber pickups over the July-September 2014 period.
# combine the 3months data into one dataset
X = NULL
for(add in c("jul","aug","sep")){
temp = fread(paste0("uber-tlc-foil-response-master/uber-trip-data/uber-raw-data-",add,"14.csv"))
X = rbind(X, temp)
}
# we download a map of NYC that contains the different neighborhoods.
# in order to not download it at every run of this Rmarkdown, I saved it as a Rdata.
#r <- GET('http://data.beta.nyc//dataset/0ff93d2d-90ba-457c-9f7e-39e47bf2ac5f/resource/35dd04fb-81b3-479b-a074-a27a37888ce7/download/d085e2f8d0b54d4590b1e7d1f35594c1pediacitiesnycneighborhoods.geojson')
load("polygon_NYC.Rdata")
nyc_neighborhoods <- readOGR(content(r,'text'), 'OGRGeoJSON', verbose = F)
nyc_neighborhoods_df <- tidy(nyc_neighborhoods)
## density plot: too much density in manahattan-> hide lots of data
# xx= X[which(X$Base == "B02682"),]
#ggplot() + geom_polygon(data=nyc_neighborhoods_df, aes(x=long, y=lat, group=group),color="grey",fill=NA) + stat_density2d(aes(x=xx$Lon,y=xx$Lat,fill = ..level..,alpha=0.001),geom = "polygon",h = .02, n = 300)+scale_alpha( guide = FALSE)+scale_fill_distiller(palette = 'Spectral')
# limit the data to the range of coordinates of the polygons.
X_subset = X %>% filter (Lon>min(nyc_neighborhoods_df$long) & Lon<max(nyc_neighborhoods_df$long)) %>% filter(Lat>min(nyc_neighborhoods_df$lat) & Lat<max(nyc_neighborhoods_df$lat))
ggplot() +
geom_polygon(data=nyc_neighborhoods_df, aes(x=long, y=lat, group=group),color="grey",fill=NA) + geom_bin2d(aes(x=X_subset$Lon,y=X_subset$Lat),bins=150,alpha=0.5)+scale_fill_distiller(palette = 'Spectral')+ scale_x_continuous(expand=c(0,0))+ scale_y_continuous(expand=c(0,0))
Thanks to the density plot, we first observe that pickups are concentrated in the Manhattan area and at the JFK airport.
Second, we observe that a lot of pickups are done outside of the NYC area (across the border with New Jersey). These data points should probably be removed to clarify the analysis results and the plots.
It would be interesting to compare the observations from the number of rides and market shares of the different FHV companies between the July-September 2014 period and the January-June 2015 period.
It would also give much insights to compare the location of the Uber pickups to the one of the other FHV companies, such as the Yellow Taxis. This could potentially highlights a cause of the increase in Uber rides, such as that maybe Uber drivers are more present outside of Manhattan than other FHV companies.
A work by Florian Rohart